from pickle import FALSE

from botasaurus import *
import random
import time
from urllib.parse import urljoin
import re
import csv
import os

from botasaurus.browser_decorator import browser
from botasaurus_driver.user_agent import UserAgent


def save_results_to_csv(results, country_suffix):
    """Write scraped city records to ``city_urls_<country_suffix>.csv``.

    Args:
        results: Iterable of dicts with exactly the keys "city" and "url".
        country_suffix: Suffix used to build the output filename.

    Returns:
        The output filename on success, or None if writing failed.
    """
    output_file = f"city_urls_{country_suffix}.csv"
    try:
        with open(output_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=["city", "url"])
            writer.writeheader()
            writer.writerows(results)
    except (OSError, csv.Error, ValueError) as e:
        # OSError covers disk/permission failures; csv.Error/ValueError cover
        # malformed rows (e.g. a dict with unexpected keys).
        print(f"Error saving CSV file: {e}")
        return None
    # If the `with` block completed, the file was written and closed — no
    # separate os.path.exists() check is needed.
    print(f"Data successfully saved to {output_file}")
    return output_file


# 配置 Botasaurus 爬虫
# Botasaurus crawler configuration
@browser(
    user_agent=UserAgent.REAL,  # realistic User-Agent
    block_images=True,  # skip image downloads to speed up crawling
    lang="zh-CN",  # Chinese locale to match the target site
    reuse_driver=True,  # reuse one browser instance to cut overhead
    close_on_crash=True,  # shut the browser down on a crash
    create_error_logs=True,  # keep error logs for debugging
    chrome_executable_path=r"C:\Program Files\Google\Chrome\Application\chrome.exe",  # explicit Chrome binary
)
def scrape_city_urls(driver, data):
    """Collect restaurant-list city URLs from each TripAdvisor country page.

    Args:
        driver: Botasaurus browser driver injected by the decorator.
        data: Dict whose "urls" key holds the country pages to visit.

    Returns:
        List of {"city": ..., "url": ...} dicts for every city found,
        accumulated across all pages (also saved per page as CSV).
    """
    target_urls = data.get("urls", [])
    collected = []

    # Maps a substring of the page URL to the CSV filename suffix.
    country_mapping = {
        "Uzbekistan": "Uzbekistan"
    }

    for index, page_url in enumerate(target_urls, start=1):
        print(f"Processing URL {index}/{len(target_urls)}: {page_url}")

        # Load the page; skip it entirely if navigation or waiting fails.
        try:
            driver.get(page_url)
            time.sleep(random.uniform(20, 30))  # generous initial settle time
            # Smart wait: block until the city links are present.
            driver.wait_for_element("div.geo_name a", 120)  # up to 120 seconds
        except Exception as e:
            print(f"Failed to load page {page_url}: {e}")
            continue

        # Extract city names and their restaurant-list URLs.
        page_results = []
        try:
            anchors = driver.select_all("div.geo_name a")
            for anchor in anchors:
                name = anchor.text.strip()
                href = anchor.get_attribute("href")
                if not (href and name):
                    continue
                # Resolve relative links against the page URL, then keep only
                # well-formed restaurant-list URLs.
                absolute = urljoin(page_url, href)
                if re.match(r"https://cn\.tripadvisor\.com/Restaurants-g\d+-.+\.html", absolute):
                    page_results.append({
                        "city": name,
                        "url": absolute
                    })
        except Exception as e:
            print(f"Error extracting city URLs: {e}")
            continue

        if not page_results:
            print("No city URLs found. Check CSS selector or page structure.")
            continue

        # Sanity-check the extracted records.
        for record in page_results:
            if not record["city"]:
                print(f"Warning: Empty city name for URL {record['url']}")
            if not record["url"].startswith("https://cn.tripadvisor.com/Restaurants-"):
                print(f"Warning: Invalid URL format: {record['url']}")

        # Derive the CSV suffix from the country name found in the URL,
        # falling back to the page's 1-based position in the list.
        suffix = next(
            (tag for country, tag in country_mapping.items() if country in page_url),
            "",
        )
        if not suffix:
            suffix = str(index)

        save_results_to_csv(page_results, suffix)
        collected.extend(page_results)

        # Rate limit: at most 6 requests per minute (>= 10 s per request).
        time.sleep(10)

    return collected



# Crawler entry point
if __name__ == "__main__":
    # Country pages to crawl
    urls = [
        "https://cn.tripadvisor.com/Restaurants-g293967-Uzbekistan.html"
    ]

    # Run the crawl and time it for rate limiting below.
    started = time.time()
    results = scrape_city_urls(data={"urls": urls})

    # Report a short preview of the outcome.
    if not results:
        print("No results extracted. Please check the crawler.")
    else:
        print(f"Successfully extracted {len(results)} city URLs:")
        for entry in results[:5]:  # cap the preview at 5 entries
            print(f"City: {entry['city']}, URL: {entry['url']}")
        overflow = len(results) - 5
        if overflow > 0:
            print(f"... and {overflow} more entries.")

    # Rate limit: pad the run so it takes at least 10 seconds overall
    # (at most 6 requests per minute).
    remaining = 10 - (time.time() - started)
    if remaining > 0:
        time.sleep(remaining)
